{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Model Selection - Train Validation Split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from pyspark.ml.evaluation import RegressionEvaluator\n",
    "from pyspark.ml.regression import LinearRegression\n",
    "from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Prepare training and test data.\n",
    "data = spark.read.format(\"libsvm\")\\\n",
    "    .load(\"sample_linear_regression_data.txt\")\n",
    "train, test = data.randomSplit([0.9, 0.1], seed=12345)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "lr = LinearRegression(maxIter=10)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# We use a ParamGridBuilder to construct a grid of parameters to search over.\n",
    "# TrainValidationSplit will try all combinations of values and determine best model using\n",
    "# the evaluator.\n",
    "paramGrid = ParamGridBuilder()\\\n",
    "    .addGrid(lr.regParam, [0.1, 0.01]) \\\n",
    "    .addGrid(lr.fitIntercept, [False, True])\\\n",
    "    .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])\\\n",
    "    .build()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# In this case the estimator is simply the linear regression.\n",
    "# A TrainValidationSplit requires an Estimator, a set of Estimator ParamMaps, and an Evaluator.\n",
    "tvs = TrainValidationSplit(estimator=lr,\n",
    "                           estimatorParamMaps=paramGrid,\n",
    "                           evaluator=RegressionEvaluator(),\n",
    "                           # 80% of the data will be used for training, 20% for validation.\n",
    "                           trainRatio=0.8)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Run TrainValidationSplit, and choose the best set of parameters.\n",
    "model = tvs.fit(train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Make predictions on test data. model is the model with combination of parameters\n",
    "# that performed best.\n",
    "model.transform(test)\\\n",
    "    .select(\"features\", \"label\", \"prediction\")\\\n",
    "    .show()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Apache Toree - PySpark",
   "language": "python",
   "name": "apache_toree_pyspark"
  },
  "language_info": {
   "codemirror_mode": "text/x-ipython",
   "file_extension": ".py",
   "mimetype": "text/x-ipython",
   "name": "python",
   "pygments_lexer": "python",
   "version": "3.6.3\n"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
